{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy.spatial import distance\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exploring Wikipedia Distance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compare(df, val1, val2):\n",
    "    return distance.euclidean(df.loc[val1], df.loc[val2])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3D embedding distances"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the wikipedia data in memory\n",
    "path = '../demo_embeddings/wikipedia_3000/iterations_250/perplexity_3/pca_25/learning_rate_10'\n",
    "embedding_df = pd.read_csv(path + f'/data.csv', index_col=0)\n",
    "embedding_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"the vs of:\", compare(embedding_df, 'the', 'of'))\n",
    "print(\"of vs to:\", compare(embedding_df, 'to', 'of'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Original Embeddings distances"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "glove_df = pd.read_csv(\"../data/wikipedia_3000.csv\", index_col=0)\n",
    "glove_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Euclidean distances for Original 300D Glove embeddings:\")\n",
    "print(\"the, of:\", compare(glove_df, 'the', 'of'))\n",
    "print(\"of, to:\", compare(glove_df, 'to', 'of'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Speed comparison, 3D vs original"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Speed for 3D Embedding:\")\n",
    "%timeit compare(embedding_df, 'the', 'of')\n",
    "print(\"\\nSpeed for original Embedding (300D):\")\n",
    "%timeit compare(glove_df, 'the', 'of')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Speed for 3D Embedding:\")\n",
    "%timeit compare(embedding_df, 'to', 'of')\n",
    "print(\"\\nSpeed for original Embedding (300D):\")\n",
    "%timeit compare(glove_df, 'to', 'of')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can see that the speeds are very similar to each other, and the computation is very efficient (less than a ms)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "selected_word = 'united'\n",
    "selected_vec = glove_df.loc[selected_word]\n",
    "word_dict = pd.DataFrame(glove_df.index)\n",
    "\n",
    "def compare_pd(vector):\n",
    "    return distance.euclidean(vector, selected_vec)\n",
    "\n",
    "%timeit glove_df.apply(compare_pd, axis=1)\n",
    "distance_map = glove_df.apply(compare_pd, axis=1)\n",
    "print(distance_map.shape)\n",
    "distance_map.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This returns exactly what we want, which is a list of all the distances a certain word, in our case 'the', and does so in a reasonable amount of time."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "selected_word = 'united'\n",
    "selected_vec = embedding_df.loc[selected_word]\n",
    "\n",
    "def compare_pd(vector):\n",
    "    return distance.euclidean(vector, selected_vec)\n",
    "\n",
    "%timeit embedding_df.apply(compare_pd, axis=1)\n",
    "distance_map_3d = embedding_df.apply(compare_pd, axis=1)\n",
    "distance_map_3d.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We again notice similar performance for the 3D embedding. Therefore there's no good reason to use this one in particular, since the original embedding captures the distribution better in any case."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Testing preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "sorted_map = distance_map.sort_values()\n",
    "sorted_map"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "sorted_3d_map = distance_map_3d.sort_values()\n",
    "sorted_3d_map"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
